{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qjubHCEzYtG8"
      },
      "source": [
        "### Step 1. Download InfoRE dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zCBJzJi6BE_o"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "%%bash\n",
        "# Download and unpack the InfoRe 16 kHz archive into /content/data.\n",
        "# Stop at the first failing command so a failed download is never unzipped.\n",
        "set -e\n",
        "mkdir -p /content/data\n",
        "cd /content/data\n",
        "# Download to a temporary name and rename only on success, so an interrupted\n",
        "# download is not mistaken for a complete archive when the cell is re-run.\n",
        "if [ ! -f infore_16k.zip ]; then\n",
        "  wget -O infore_16k.zip.part https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_16k.zip\n",
        "  mv infore_16k.zip.part infore_16k.zip\n",
        "fi\n",
        "# unzip -P BroughtToYouByInfoRe 25hours.zip\n",
        "# -o: overwrite already-extracted files instead of asking, which keeps re-runs non-interactive.\n",
        "unzip -o infore_16k.zip"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "6C47hb9nYzmB"
      },
      "source": [
        "### Step 2. Normalize audio clips"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Hp9TK8PbBcQM"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "# Install the tools used by the following steps: the sox command-line tool and Python audio/ONNX packages.\n",
        "# apt-get is used instead of apt because apt's command-line interface is not intended for scripts.\n",
        "!sudo apt-get install -y sox\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "%pip install soundfile librosa\n",
        "# NOTE(review): onnxruntime is pinned, presumably for compatibility with the NSNet2 baseline script; confirm before changing.\n",
        "%pip install onnxruntime==1.11.1"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "FW45D8xM9Mcc",
        "outputId": "8d7ea7a9-ea5a-48ca-88fe-37dd4ed55d9b"
      },
      "outputs": [],
      "source": [
        "# Convert every InfoRe clip to mono, 16-bit signed-integer PCM at 16 kHz,\n",
        "# normalized to -3 dB with sox, writing the results to /content/infore_16k.\n",
        "from pathlib import Path\n",
        "import os\n",
        "import subprocess\n",
        "\n",
        "from tqdm.auto import tqdm\n",
        "\n",
        "SRC_DIR = Path(\"/content/data/InfoRe\")\n",
        "OUT_DIR = Path(\"/content/infore_16k\")\n",
        "OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
        "\n",
        "wavs = sorted(SRC_DIR.glob(\"*.wav\"))\n",
        "if not wavs:\n",
        "    print(f\"No .wav files found in {SRC_DIR}; was the archive extracted?\")\n",
        "\n",
        "failed = []\n",
        "for path in tqdm(wavs):\n",
        "    out = OUT_DIR / path.name\n",
        "    # Pass an argument list (no shell) so spaces or shell metacharacters in file names are handled safely.\n",
        "    cmd = [\n",
        "        \"sox\", str(path),\n",
        "        \"-c\", \"1\", \"-e\", \"signed-integer\", \"-b\", \"16\", \"-r\", \"16k\",\n",
        "        \"--norm=-3\",\n",
        "        str(out),\n",
        "    ]\n",
        "    # Keep going on a bad clip, but remember it instead of silently ignoring the exit status.\n",
        "    if subprocess.run(cmd).returncode != 0:\n",
        "        failed.append(path.name)\n",
        "\n",
        "if failed:\n",
        "    print(f\"sox failed on {len(failed)} of {len(wavs)} files, e.g. {failed[:5]}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kooiBrsQY5sQ"
      },
      "source": [
        "### Step 3. Denoise using the DNS-Challenge NSNet2 baseline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hsXeNkZ3Xacj"
      },
      "outputs": [],
      "source": [
        "# Run the NSNet2 baseline from microsoft/DNS-Challenge over the normalized clips.\n",
        "# Start from /content so that re-running this cell does not clone into the\n",
        "# working directory left behind by a previous run.\n",
        "%cd /content\n",
        "# Clone only when the checkout is missing; git clone fails if the directory already exists.\n",
        "!test -d DNS-Challenge || git clone https://github.com/microsoft/DNS-Challenge\n",
        "%cd DNS-Challenge/NSNet2-baseline/\n",
        "# Pin the repository to a fixed commit so the script and the model file stay in sync.\n",
        "!git checkout -f 8b87a33b2892f147b5c7ad39ea978453730db269\n",
        "!python run_nsnet2.py -i /content/infore_16k/ -o /content/infore_16k_denoised -m ./nsnet2-20ms-baseline.onnx"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T5JtZwKgZI4r"
      },
      "source": [
        "### Step 4. Zip the denoised dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "eeFggV0uYop_"
      },
      "outputs": [],
      "source": [
        "# Bundle the denoised clips together with the original .txt files into one archive.\n",
        "%cd /content\n",
        "!cp /content/data/InfoRe/*.txt ./infore_16k_denoised\n",
        "# Use && so zip only runs when the cd succeeded; with ';' a missing directory\n",
        "# would make zip archive everything under /content instead.\n",
        "!cd ./infore_16k_denoised && zip -r ../infore_16k_denoised.zip ."
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "name": "prepare_infore_dataset.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
